pacman::p_load(psych, xray , ggplot2, texreg, DT, wrapr, dplyr,
sjmisc, sjlabelled, sjstats, sjPlot, tidyr,ggpubr,corrplot,
knitr, kableExtra, captioner, car, excelR,ggcorrplot,Rmisc,sjmisc,funModeling,
forcats, hrbrthemes,tidyverse,finalfit,patchwork,GGally,broom,caret,matrixTests,PerformanceAnalytics)
df = read.csv("Well_being_Health_Behavior_Environmental_Fact.csv")
pacman::p_load("DT")
head(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))
pacman::p_load("DT")
tail(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))
pacman::p_load("DT","psych")
headTail(x = df,top = 4,bottom = 4) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T,scrollY=T))
cat("The Dataset (df) contains ",nrow(df)," records and", ncol(df)," Columns or variables.")
## The Dataset (df) contains 395697 records and 33 Columns or variables.
str(df)
## 'data.frame': 395697 obs. of 33 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 30 64 49 43 68 38 23 61 78 82 ...
## $ wellbeing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SocialSupport : int 5 4 3 5 4 4 5 3 5 3 ...
## $ race : int 0 1 1 1 1 1 1 1 1 0 ...
## $ income : int 5 5 1 5 2 5 5 NA NA NA ...
## $ GeneralHealth : int 4 4 5 2 5 4 1 3 3 3 ...
## $ PoorPhysicalHealthDays: int 0 5 NA 0 30 0 0 0 4 0 ...
## $ PoorMentalHealthDays : int 0 0 0 0 3 0 0 0 0 0 ...
## $ Asthma : int 0 1 1 0 1 0 0 0 0 0 ...
## $ HealthInsurance : int 1 1 1 1 0 1 1 1 1 1 ...
## $ CVD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LimitedActivity : int 0 1 1 0 1 1 0 0 0 0 ...
## $ Diabetes : int 0 1 0 0 0 0 0 0 0 0 ...
## $ employ : int 1 7 8 1 8 1 1 7 7 5 ...
## $ BMI : int 3 3 3 2 1 2 2 3 2 1 ...
## $ HeavyDrinker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ CurrentSmoker : int 0 0 1 0 0 0 0 0 0 0 ...
## $ PoorSleepDays : int 1 30 0 0 14 30 0 2 0 2 ...
## $ PhysicalActivity : int 1 0 0 1 0 0 1 1 1 0 ...
## $ marital : int 1 1 0 1 1 1 1 1 1 0 ...
## $ PrematureMortality : num 440 440 440 440 440 ...
## $ HouseholdIncome : int 48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
## $ ParkAccess : int 20 20 20 20 20 20 20 20 20 20 ...
## $ CrimeRate : int 300 300 300 300 300 300 300 300 300 300 ...
## $ UnemploymentRate : num 8 8 8 8 8 8 8 8 8 8 ...
## $ WaterSafety : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighSchoolRate : int 80 80 80 80 80 80 80 80 80 80 ...
## $ SomeCollegeRate : num 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
## $ AccessToRecFacility : num 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
## $ FastFoodPercentage : int 52 52 52 52 52 52 52 52 52 52 ...
## $ PM2.5 : num 13.1 13.1 13.1 13.1 13.1 ...
## $ PopulationDensity : num 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
The Dataset originally has only numerical and integer
values
Our Target variable “wellbeing” is also an integer datatype.
However, we expect that into a category as we conceive it as
classification thing of Wellbeing in the form of category as (yes or no)
or (good or bad).
2.Action
: Convert “Wellbeing” into a factor
It is also evident by looking at the structure of the dataset
that several other variables like (Asthma, HealthInsurance, CVD,
Diabetes, HeavyDrinker) to name a few are supposed to be categories or
factors but infact they are integers.
3.Action : Convert those set of
Categorical Variables as Factors from Integers.
Some set of variables are of the perfect type for example - Age is an integer and is supposed to be one. PM2.5 which represents the Fine Particle Matter in micrograms/cubic meter is actually a numerical data in a continous form.
Some Variables such as BMI and income were expected to be continuous. However, they are ordinal values mentioned in integers which is a point of concern
5. Action : Convert those variables into probable factors whenver needed in Analytics
summary(df)
## X age wellbeing SocialSupport
## Min. : 1 Min. : 7.00 Min. :0.000 Min. :1.000
## 1st Qu.: 98925 1st Qu.:45.00 1st Qu.:0.000 1st Qu.:4.000
## Median :197849 Median :57.00 Median :0.000 Median :5.000
## Mean :197849 Mean :56.32 Mean :0.055 Mean :4.181
## 3rd Qu.:296773 3rd Qu.:69.00 3rd Qu.:0.000 3rd Qu.:5.000
## Max. :395697 Max. :99.00 Max. :1.000 Max. :5.000
## NA's :17025 NA's :20998
## race income GeneralHealth PoorPhysicalHealthDays
## Min. :0.000 Min. :1.00 Min. :1.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.: 0.000
## Median :1.000 Median :4.00 Median :3.000 Median : 0.000
## Mean :0.806 Mean :3.61 Mean :2.586 Mean : 4.472
## 3rd Qu.:1.000 3rd Qu.:5.00 3rd Qu.:3.000 3rd Qu.: 3.000
## Max. :1.000 Max. :5.00 Max. :5.000 Max. :30.000
## NA's :5309 NA's :54657 NA's :1555 NA's :9341
## PoorMentalHealthDays Asthma HealthInsurance CVD
## Min. : 0.000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000
## Median : 0.000 Median :0.0000 Median :1.0000 Median :0.000
## Mean : 3.497 Mean :0.0936 Mean :0.8955 Mean :0.067
## 3rd Qu.: 2.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :30.000 Max. :1.0000 Max. :1.0000 Max. :1.000
## NA's :7361 NA's :2590 NA's :991 NA's :3928
## LimitedActivity Diabetes employ BMI
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :0.0000 Median :0.0000 Median :3.000 Median :2.000
## Mean :0.2723 Mean :0.1274 Mean :3.885 Mean :1.933
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:7.000 3rd Qu.:3.000
## Max. :1.0000 Max. :1.0000 Max. :8.000 Max. :3.000
## NA's :1858 NA's :395 NA's :1322 NA's :17290
## HeavyDrinker CurrentSmoker PoorSleepDays PhysicalActivity
## Min. :0.000 Min. :0.0000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median : 3.000 Median :1.0000
## Mean :0.046 Mean :0.1583 Mean : 7.707 Mean :0.7303
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:12.000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :30.000 Max. :1.0000
## NA's :10742 NA's :2463 NA's :7560 NA's :533
## marital PrematureMortality HouseholdIncome ParkAccess
## Min. :0.0000 Min. :133.5 Min. : 22289 Min. : 1.00
## 1st Qu.:0.0000 1st Qu.:294.0 1st Qu.: 41103 1st Qu.: 13.00
## Median :1.0000 Median :345.0 Median : 47930 Median : 32.00
## Mean :0.5552 Mean :355.8 Mean : 50177 Mean : 34.58
## 3rd Qu.:1.0000 3rd Qu.:405.7 3rd Qu.: 56166 3rd Qu.: 52.00
## Max. :1.0000 Max. :883.5 Max. :119525 Max. :100.00
## NA's :1559 NA's :9633
## CrimeRate UnemploymentRate WaterSafety HighSchoolRate
## Min. : 12.0 Min. : 1.100 Min. : 0.000 Min. : 27.0
## 1st Qu.: 202.0 1st Qu.: 7.000 1st Qu.: 0.000 1st Qu.: 74.0
## Median : 343.0 Median : 8.400 Median : 1.000 Median : 80.0
## Mean : 405.6 Mean : 8.716 Mean : 6.511 Mean : 79.1
## 3rd Qu.: 558.0 3rd Qu.:10.200 3rd Qu.: 5.000 3rd Qu.: 86.0
## Max. :2062.0 Max. :29.700 Max. :100.000 Max. :100.0
## NA's :4174 NA's :5039 NA's :16
## SomeCollegeRate AccessToRecFacility FastFoodPercentage PM2.5
## Min. :23.20 Min. : 0.00 Min. : 8.00 Min. : 6.50
## 1st Qu.:54.00 1st Qu.: 6.90 1st Qu.: 44.00 1st Qu.:10.86
## Median :61.20 Median : 9.80 Median : 50.00 Median :11.74
## Mean :60.45 Mean :10.13 Mean : 48.33 Mean :11.63
## 3rd Qu.:67.70 3rd Qu.:13.00 3rd Qu.: 54.00 3rd Qu.:12.78
## Max. :90.40 Max. :57.50 Max. :100.00 Max. :14.78
## NA's :42
## PopulationDensity
## Min. : 1.7
## 1st Qu.: 66.3
## Median : 276.9
## Mean : 1186.5
## 3rd Qu.: 991.3
## Max. :69468.4
##
The Summary provides a good support of evidence for those points
of concerns expressed earlier in the structure of the dataset. It
describes the standard statistics for those variables which are supposed
to be categorical variables either ordinal or nominal
The Summary indicates
something interesting that many records and most variables in the
dataset contains null values.
Action : Either Remove
or Impute Null Values ensuring that in each of the case does not
statistically significantly alters the dataset.
The Good part that the summary iterates is that the almost all variables are within their supposed range. For an instance, no age record is negative, No PM2.5 less than 0, Household income min is not negative and so on. This is a sigh of relief.
Below are some boxplots for concerning continuous and discrete variables demonstrating their ranges with outliers
This boxplot for age seems very healthy.
boxplot(df["PrematureMortality"],main = "Premature Mortality Box Plot", xlab = "Premature Mortality",horizontal = TRUE)
The distribution of the Premature Mortality have some outliers in the right ends shows that it is skewed to the right.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PrematureMortality)) + geom_density()+
ggtitle ("Premature Mortality Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
From the above test, we could possibly reject the null hypothesis that the distribution of premature Mortality is normal. However through Central Limit Theorem I shall prove the normality.
s30 <- c()
s50 <- c()
s500 <- c()
n =396000
for ( i in 1:n){
s30[i] = mean(sample(df$PrematureMortality,30, replace = TRUE))
s50[i] = mean(sample(df$PrematureMortality,50, replace = TRUE))
s500[i] = mean(sample(df$PrematureMortality,500, replace = TRUE))
}
par(mfrow=c(1,3))
hist(s30, col ="lightblue",main="Sample size=30",xlab ="Premature Mortality")
abline(v = mean(s30), col = "red")
hist(s50, col ="lightgreen", main="Sample size=50",xlab ="Premature Mortality")
abline(v = mean(s50), col = "red")
hist(s500, col ="orange",main="Sample size=500",xlab ="Premature Mortality")
abline(v = mean(s500), col = "red")
We could clearly see a bell curve with Central Limit Theorem and prove that with sufficiently large sample size any distribution that does not seem to be normally distributed gets normally distributed using Central Limit Theorem or Bootstrapping.
boxplot(df["HouseholdIncome"],main = "Household Income Box Plot", xlab = "Household Income",horizontal = TRUE)
The distribution of the Household Income have some outliers in the right
ends shows that it is skewed to the right.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HouseholdIncome)) + geom_density()+
ggtitle ("Household Income Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply
the CLT or Bootstrapping Normality would be proven.
So from now on we shall not Check with the CLT or Bootstrapping for the normal distribution of any variable rather assume that the distribution will be normal whenever viewed from the lens of CLT or Bootstrapping.
boxplot(df["UnemploymentRate"],main = "Unemployment Rate Box Plot", xlab = "Unemployment Rate ",horizontal = TRUE)
The distribution of the Unemployment Rate have some outliers on both
ends more probably on the right which signifies that it is skewed more
to the right.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=UnemploymentRate)) + geom_density()+
ggtitle ("Unemployment Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["HighSchoolRate"],main = "High School Rate Box Plot", xlab = "High School Rate",horizontal = TRUE)
The distribution of the High School Rate have some outliers on left ends
which signifies that it is skewed more to the left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HighSchoolRate)) + geom_density()+
ggtitle ("High School Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
## Warning: Removed 16 rows containing non-finite values (`stat_density()`).
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["SomeCollegeRate"],main = "Some College Rate Box Plot", xlab = "Some College Rate ",horizontal = TRUE)
The distribution of the Some College Rate have some outliers on left
ends which signifies that it is skewed more to the left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=SomeCollegeRate)) + geom_density()+
ggtitle ("Some College Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["AccessToRecFacility"],main = "Access To Recreational Facility Box Plot", xlab = "Access To Recreational Facility ",horizontal = TRUE)
The distribution of the Access To Recreational Facility have some
outliers on right ends which signifies that it is skewed more to the
left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=AccessToRecFacility)) + geom_density()+
ggtitle ("Access To Recreational Facility Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["FastFoodPercentage"],main = "Fast Food Percentage Box Plot", xlab = "Fast Food Percentage ",horizontal = TRUE)
The distribution of the Fast Food Percentage have some outliers on both
ends
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=FastFoodPercentage)) + geom_density()+
ggtitle ("Fast Food Percentage Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
## Warning: Removed 42 rows containing non-finite values (`stat_density()`).
The distribution seems normal with multiple peaks and higher kurtosis
boxplot(df["PM2.5"],main = "PM2.5 Box Plot", xlab = "PM2.5 ",horizontal = TRUE)
The distribution of the PM2.5 have some outliers on left ends which
signifies that it is skewed more to the left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PM2.5)) + geom_density()+
ggtitle ("PM2.5 Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["PopulationDensity"],main = "Population Density Box Plot", xlab = "Population Density ",horizontal = TRUE)
The distribution of the Population Density have some outliers on right
ends which signifies that it is skewed more to the right
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PopulationDensity)) + geom_density()+
ggtitle ("Population Density Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
As far as continuous variables are concerned their
distribution is quite questionable however, with Central Limit Theorem
it could be deduced that they are representative of their population.
Also the boxplot help us indicates that the variables are in their
expected range and have acceptable set of outliers and shall help us
validate the min, max, median and range of those variables.
apply(apply(df,2,is.na),2,sum)
## X age wellbeing
## 0 0 17025
## SocialSupport race income
## 20998 5309 54657
## GeneralHealth PoorPhysicalHealthDays PoorMentalHealthDays
## 1555 9341 7361
## Asthma HealthInsurance CVD
## 2590 991 3928
## LimitedActivity Diabetes employ
## 1858 395 1322
## BMI HeavyDrinker CurrentSmoker
## 17290 10742 2463
## PoorSleepDays PhysicalActivity marital
## 7560 533 1559
## PrematureMortality HouseholdIncome ParkAccess
## 0 0 9633
## CrimeRate UnemploymentRate WaterSafety
## 4174 0 5039
## HighSchoolRate SomeCollegeRate AccessToRecFacility
## 16 0 0
## FastFoodPercentage PM2.5 PopulationDensity
## 42 0 0
library(tidyr)
pacman::p_load(inspectdf)
df %>% inspect_na %>% show_plot
sum(apply(df,1,anyNA))
## [1] 119348
cat("There are around ",sum(apply(df,1,anyNA)), " null records in the dataset of in total", nrow(df),"records \n","In case we delete all of the", sum(apply(df,1,anyNA)),"records with missing values then we are actually losing around", 100*(sum(apply(df,1,anyNA))/nrow(df)),"% of the total dataset.")
## There are around 119348 null records in the dataset of in total 395697 records
## In case we delete all of the 119348 records with missing values then we are actually losing around 30.16146 % of the total dataset.
df.omit <- na.omit(df)
pacman::p_load(psych,DT)
headTail(df.omit, 10, 10) %>% datatable( rownames = FALSE, filter="top", options = list(pageLength = 21, scrollX=T))
cat(nrow(df.omit),ncol(df.omit))
## 276349 33
#write.csv(df.omit,file="df.omit.csv")
Due to large Sample size everything starts to make sense which can be seen by the p-value. Even after it shows statistically significant difference in the dataset before and after deleting null values. We tend to ignore the statistical evidence and stick to the fact that we have quite enough sample size to represent the population as a whole.
str(df.omit)
## 'data.frame': 276349 obs. of 33 variables:
## $ X : int 1 2 4 5 6 7 12 13 14 15 ...
## $ age : int 30 64 43 68 38 23 44 64 75 70 ...
## $ wellbeing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SocialSupport : int 5 4 5 4 4 5 4 3 1 5 ...
## $ race : int 0 1 1 1 1 1 1 0 1 1 ...
## $ income : int 5 5 5 2 5 5 5 3 3 4 ...
## $ GeneralHealth : int 4 4 2 5 4 1 5 4 1 2 ...
## $ PoorPhysicalHealthDays: int 0 5 0 30 0 0 21 3 0 0 ...
## $ PoorMentalHealthDays : int 0 0 0 3 0 0 14 2 0 0 ...
## $ Asthma : int 0 1 0 1 0 0 1 0 0 0 ...
## $ HealthInsurance : int 1 1 1 0 1 1 1 1 1 1 ...
## $ CVD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LimitedActivity : int 0 1 0 1 1 0 1 0 0 0 ...
## $ Diabetes : int 0 1 0 0 0 0 0 0 0 0 ...
## $ employ : int 1 7 1 8 1 1 7 7 7 7 ...
## $ BMI : int 3 3 2 1 2 2 2 2 2 2 ...
## $ HeavyDrinker : int 0 0 1 0 0 0 0 0 0 0 ...
## $ CurrentSmoker : int 0 0 0 0 0 0 0 0 1 0 ...
## $ PoorSleepDays : int 1 30 0 14 30 0 15 10 0 15 ...
## $ PhysicalActivity : int 1 0 1 0 0 1 0 1 1 1 ...
## $ marital : int 1 1 1 1 1 1 0 1 0 0 ...
## $ PrematureMortality : num 440 440 440 440 440 ...
## $ HouseholdIncome : int 48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
## $ ParkAccess : int 20 20 20 20 20 20 20 20 20 20 ...
## $ CrimeRate : int 300 300 300 300 300 300 300 300 300 300 ...
## $ UnemploymentRate : num 8 8 8 8 8 8 8 8 8 8 ...
## $ WaterSafety : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighSchoolRate : int 80 80 80 80 80 80 80 80 80 80 ...
## $ SomeCollegeRate : num 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
## $ AccessToRecFacility : num 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
## $ FastFoodPercentage : int 52 52 52 52 52 52 52 52 52 52 ...
## $ PM2.5 : num 13.1 13.1 13.1 13.1 13.1 ...
## $ PopulationDensity : num 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
## - attr(*, "na.action")= 'omit' Named int [1:119348] 3 8 9 10 11 22 26 27 29 32 ...
## ..- attr(*, "names")= chr [1:119348] "3" "8" "9" "10" ...
Factorizing
variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
df.omit[,i] <- as.factor(df.omit[,i])
}
Checking the Status
str(df.omit)
## 'data.frame': 276349 obs. of 33 variables:
## $ X : int 1 2 4 5 6 7 12 13 14 15 ...
## $ age : int 30 64 43 68 38 23 44 64 75 70 ...
## $ wellbeing : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SocialSupport : Factor w/ 5 levels "1","2","3","4",..: 5 4 5 4 4 5 4 3 1 5 ...
## $ race : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 1 2 2 ...
## $ income : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 2 5 5 5 3 3 4 ...
## $ GeneralHealth : Factor w/ 5 levels "1","2","3","4",..: 4 4 2 5 4 1 5 4 1 2 ...
## $ PoorPhysicalHealthDays: int 0 5 0 30 0 0 21 3 0 0 ...
## $ PoorMentalHealthDays : int 0 0 0 3 0 0 14 2 0 0 ...
## $ Asthma : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 1 1 1 ...
## $ HealthInsurance : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 2 2 ...
## $ CVD : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ LimitedActivity : Factor w/ 2 levels "0","1": 1 2 1 2 2 1 2 1 1 1 ...
## $ Diabetes : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ employ : Factor w/ 8 levels "1","2","3","4",..: 1 7 1 8 1 1 7 7 7 7 ...
## $ BMI : Factor w/ 3 levels "1","2","3": 3 3 2 1 2 2 2 2 2 2 ...
## $ HeavyDrinker : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
## $ CurrentSmoker : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## $ PoorSleepDays : int 1 30 0 14 30 0 15 10 0 15 ...
## $ PhysicalActivity : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 2 2 ...
## $ marital : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 1 2 1 1 ...
## $ PrematureMortality : num 440 440 440 440 440 ...
## $ HouseholdIncome : int 48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
## $ ParkAccess : int 20 20 20 20 20 20 20 20 20 20 ...
## $ CrimeRate : int 300 300 300 300 300 300 300 300 300 300 ...
## $ UnemploymentRate : num 8 8 8 8 8 8 8 8 8 8 ...
## $ WaterSafety : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighSchoolRate : int 80 80 80 80 80 80 80 80 80 80 ...
## $ SomeCollegeRate : num 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
## $ AccessToRecFacility : num 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
## $ FastFoodPercentage : int 52 52 52 52 52 52 52 52 52 52 ...
## $ PM2.5 : num 13.1 13.1 13.1 13.1 13.1 ...
## $ PopulationDensity : num 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
## - attr(*, "na.action")= 'omit' Named int [1:119348] 3 8 9 10 11 22 26 27 29 32 ...
## ..- attr(*, "names")= chr [1:119348] "3" "8" "9" "10" ...
df.omit <- within(df.omit, {
wellbeing <- Recode(wellbeing, '"1" = "Poor_Wellbeing"; "0" = "Good_Wellbeing"', as.factor=TRUE, to.value="=", interval=":",
separator=";")
})
df.omit <- within(df.omit, {
CurrentSmoker <- Recode(CurrentSmoker, '"1" = "Current Smoker"; "0" = "Not a Current Smoker"; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
HeavyDrinker <- Recode(HeavyDrinker, '"1" = "Heavy Drinker"; "0" = "Not a Heavy Drinker"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
HealthInsurance <- Recode(HealthInsurance , '"1" = "With Health Coverage"; "0" = "No Health Coverage"', as.factor=TRUE, to.value="=", interval=":",
separator=";")
})
df.omit <- within(df.omit, {
CVD <- Recode(CVD, '"1" = "Cardiovascular Disease"; "0" = "No Cardiovascular Disease"; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
Diabetes <- Recode(Diabetes, '"1" = "Diabetic"; "0" = "Non Diabetic"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
LimitedActivity <- Recode(LimitedActivity , '"1" = "Limited"; "0" = "Not Limited"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
PhysicalActivity <- Recode(PhysicalActivity , '"1" = "Adequate Activity"; "0" = "Low Activity"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
employ <- Recode(employ , '"1" = "Waged Employement"; "2" = "Self-Employed";"3" = "Unemployed for more than 1 yr" ;"4" = "Unemployed for less than 1 yr";"5" = "Homemaker";"6" = "Student";"7" = "Retired";"8" = "Unable to work" ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
BMI <- Recode(BMI , '"1" = "Low"; "2" = "Medium";"3" = "High" ;;;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit$income <- recode_factor(df.omit$income,
"1" = "less than 15000",
"2" = "[15000-25000)",
"3" = "[25000-35000)",
"4" = "[35000-50000)",
"5" = "50000 or more")
df.omit$Asthma <- recode_factor(df.omit$Asthma, '0' = "No", '1' = "Yes")
df.omit$SocialSupport <- recode_factor(df.omit$SocialSupport, '1' = "Never", '2' = " Rarely", '3' = " Sometimes", '4' = "Usually", '5' = "Always")
df.omit$race <- recode_factor(df.omit$race, '0' = "Other races", '1' = "White")
df.omit$GeneralHealth <- recode_factor(df.omit$GeneralHealth, '1' = "Excellent",
'2' = "Very good",
'3' = "Good",
'4' = "Fair",
'5' = "Poor")
write.csv(df.omit,"Final_Furbished.csv",row.names = FALSE)
df_final = read.csv("Final_Furbished.csv")
variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
df_final[,i] <- as.factor(df_final[,i])
}
summary(df_final)
## X age wellbeing SocialSupport
## Min. : 1 Min. : 7.00 Good_Wellbeing:261596 Rarely : 8651
## 1st Qu.: 98518 1st Qu.:44.00 Poor_Wellbeing: 14753 Sometimes: 29784
## Median :196784 Median :56.00 Always :142509
## Mean :197108 Mean :55.43 Never : 12674
## 3rd Qu.:295423 3rd Qu.:67.00 Usually : 82731
## Max. :395697 Max. :99.00
##
## race income GeneralHealth
## Other races: 49945 [15000-25000) : 46105 Excellent:52458
## White :226404 [25000-35000) : 32252 Fair :33796
## [35000-50000) : 42101 Good :81357
## 50000 or more :127462 Poor :14504
## less than 15000: 28429 Very good:94234
##
##
## PoorPhysicalHealthDays PoorMentalHealthDays Asthma
## Min. : 0.000 Min. : 0.000 No :250697
## 1st Qu.: 0.000 1st Qu.: 0.000 Yes: 25652
## Median : 0.000 Median : 0.000
## Mean : 4.201 Mean : 3.435
## 3rd Qu.: 3.000 3rd Qu.: 2.000
## Max. :30.000 Max. :30.000
##
## HealthInsurance CVD
## No Health Coverage : 27420 Cardiovascular Disease : 17975
## With Health Coverage:248929 No Cardiovascular Disease:258374
##
##
##
##
##
## LimitedActivity Diabetes
## Limited : 72410 Diabetic : 32822
## Not Limited:203939 Non Diabetic:243527
##
##
##
##
##
## employ BMI
## Waged Employement :121619 High : 79736
## Retired : 74731 Low : 94694
## Self-Employed : 23167 Medium:101919
## Homemaker : 18497
## Unable to work : 17547
## Unemployed for more than 1 yr: 8846
## (Other) : 11942
## HeavyDrinker CurrentSmoker PoorSleepDays
## Heavy Drinker : 14151 Current Smoker : 44197 Min. : 0.00
## Not a Heavy Drinker:262198 Not a Current Smoker:232152 1st Qu.: 0.00
## Median : 3.00
## Mean : 7.77
## 3rd Qu.:12.00
## Max. :30.00
##
## PhysicalActivity marital PrematureMortality HouseholdIncome
## Adequate Activity:207843 0:116948 Min. :133.5 Min. : 23751
## Low Activity : 68506 1:159401 1st Qu.:290.6 1st Qu.: 41796
## Median :341.3 Median : 48693
## Mean :350.7 Mean : 50816
## 3rd Qu.:400.2 3rd Qu.: 56750
## Max. :883.5 Max. :119525
##
## ParkAccess CrimeRate UnemploymentRate WaterSafety
## Min. : 1.00 Min. : 12.0 Min. : 1.100 Min. : 0.000
## 1st Qu.: 13.00 1st Qu.: 200.0 1st Qu.: 6.900 1st Qu.: 0.000
## Median : 33.00 Median : 343.0 Median : 8.400 Median : 1.000
## Mean : 34.77 Mean : 397.8 Mean : 8.626 Mean : 6.423
## 3rd Qu.: 53.00 3rd Qu.: 544.0 3rd Qu.:10.200 3rd Qu.: 5.000
## Max. :100.00 Max. :1627.0 Max. :29.700 Max. :100.000
##
## HighSchoolRate SomeCollegeRate AccessToRecFacility FastFoodPercentage
## Min. : 27.00 Min. :24.20 Min. : 0.00 Min. : 8.00
## 1st Qu.: 74.00 1st Qu.:54.70 1st Qu.: 7.20 1st Qu.: 44.00
## Median : 81.00 Median :61.90 Median : 9.90 Median : 50.00
## Mean : 79.36 Mean :61.13 Mean :10.34 Mean : 48.22
## 3rd Qu.: 86.00 3rd Qu.:68.30 3rd Qu.:13.00 3rd Qu.: 54.00
## Max. :100.00 Max. :90.40 Max. :51.60 Max. :100.00
##
## PM2.5 PopulationDensity
## Min. : 6.50 Min. : 1.7
## 1st Qu.:10.83 1st Qu.: 69.8
## Median :11.69 Median : 290.2
## Mean :11.59 Mean : 1036.1
## 3rd Qu.:12.78 3rd Qu.: 991.3
## Max. :14.78 Max. :69468.4
##
p1a <- df_final %>%
ggplot(aes(x = SocialSupport, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different social support level")
p1a
p2 <- df_final %>%
ggplot(aes(x = income, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + theme(axis.text.x = element_text(angle = 30)) + labs(title = "Wellbeing status on different income status")
p2
p3 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different races")
p3
p4 <- df_final %>%
ggplot(aes(x = Asthma, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status between people having or not Asthma")
p4
p5 <- df_final %>%
ggplot(aes(x = Age_group, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different age group") + theme(axis.text.x = element_text(angle = 30))
p5
p6 <- df_final %>%
ggplot(aes(x = GeneralHealth, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different general health status")
p6
p8 <- df_final %>%
ggplot(aes(x = CVD, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on patient with and without CVD")
p8
table(df_final$wellbeing, df_final$race)
##
## Other races White
## Good_Wellbeing 46677 214919
## Poor_Wellbeing 3268 11485
rx.p1 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be minimal
visual difference in the percentages of the Poor and Good Wellbeing for
Race Non white and Hispanic and Race White and Hispanic
table(df_final$wellbeing, df_final$HealthInsurance)
##
## No Health Coverage With Health Coverage
## Good_Wellbeing 24204 237392
## Poor_Wellbeing 3216 11537
rx.p1 <- df_final %>%
ggplot(aes(x = HealthInsurance, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = HealthInsurance, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
From the Visual Evidence, it
could be said that the Health Insurance affect the Wellbeing of an
individual. It could interpreted from the graph that more proportions of
individuals without Health Insurance possessing poor wellbeing as
compared to those who have Health Coverage.
table(df_final$wellbeing, df_final$CVD)
##
## Cardiovascular Disease No Cardiovascular Disease
## Good_Wellbeing 16348 245248
## Poor_Wellbeing 1627 13126
rx.p1 <- df_final %>%
ggplot(aes(x = CVD, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = CVD, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be visual
difference in the percentages of the Poor and Good Wellbeing for poeple
with and without CVD
table(df_final$LimitedActivity, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## Limited 63161 9249
## Not Limited 198435 5504
df_final %>%
ggplot(aes(x = LimitedActivity , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = LimitedActivity , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be confirmed visually
that with Low Physical Activitys there is a significant increase in
proportion of the Poor Wellbeing of individuals
table(df_final$wellbeing, df_final$Diabetes)
##
## Diabetic Non Diabetic
## Good_Wellbeing 30091 231505
## Poor_Wellbeing 2731 12022
rx.p1 <- df_final %>%
ggplot(aes(x = Diabetes, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = Diabetes, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be a visual
difference in the percentages of the Poor and Good Wellbeing between
Diabetic and Non Diabetic individuals.
table(df_final$wellbeing, df_final$employ)
##
## Homemaker Retired Self-Employed Student Unable to work
## Good_Wellbeing 17917 72319 22360 3831 13376
## Poor_Wellbeing 580 2412 807 230 4171
##
## Unemployed for less than 1 yr Unemployed for more than 1 yr
## Good_Wellbeing 6808 7310
## Poor_Wellbeing 1073 1536
##
## Waged Employement
## Good_Wellbeing 117675
## Poor_Wellbeing 3944
df_final %>%
ggplot(aes(x = employ, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = employ, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
–> There seems to be
minimal visual difference in the percentages of the Poor and Good
Wellbeing for Homemaker, Retired and Waged Employment
–>
There is relatively higher proportions of Poor Wellbeing in students
than the prior-mentioned employment groups.
–> Individuals who
are unable to work or are unemployed (either more than a year or less
than a year) have significantly higher proportions of Poor Wellbeing
than any other employment status groups.
table(df_final$wellbeing, df_final$BMI)
##
## High Low Medium
## Good_Wellbeing 74077 90142 97377
## Poor_Wellbeing 5659 4552 4542
df_final %>%
ggplot(aes(x = BMI, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = BMI, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
Individuals with higher
BMI tend to have slightly higher proportions of “POOR” wellbeing.
table(df_final$PoorSleepDays, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## 0 93450 2530
## 1 8071 164
## 2 21513 488
## 3 14940 418
## 4 9864 322
## 5 20947 734
## 6 3994 181
## 7 5835 311
## 8 2419 108
## 9 251 17
## 10 17947 1009
## 11 61 9
## 12 1630 115
## 13 88 8
## 14 1736 160
## 15 15834 1399
## 16 234 34
## 17 137 24
## 18 269 37
## 19 34 5
## 20 11968 1344
## 21 410 64
## 22 186 34
## 23 79 21
## 24 181 26
## 25 4416 633
## 26 240 36
## 27 294 40
## 28 1421 201
## 29 642 84
## 30 22505 4197
df_final %>%
ggplot(aes(x = PoorSleepDays , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = PoorSleepDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be seen visually that
with increment in the Poor Sleep days there is a significant
increase in the Poor Wellbeing of indivduals.
table(df_final$PhysicalActivity, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## Adequate Activity 199787 8056
## Low Activity 61809 6697
x1 = df_final %>%
ggplot(aes(x = PhysicalActivity , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
x2 =df_final %>%
ggplot(aes(x = PhysicalActivity , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
x1+x2
It could be confirmed visually
that with Low Physical Activitys there is a significant increase in
proportion of the Poor Wellbeing of individuals
table(df_final$wellbeing, df_final$race)
##
## Other races White
## Good_Wellbeing 46677 214919
## Poor_Wellbeing 3268 11485
rx.p1 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be minimal
visual difference in the percentages of the Poor and Good Wellbeing for
Heavy Drinker and not a Heavy Drinker.
table(df_final$wellbeing, df_final$CurrentSmoker)
##
## Current Smoker Not a Current Smoker
## Good_Wellbeing 38944 222652
## Poor_Wellbeing 5253 9500
rx.p1 <- df_final %>%
ggplot(aes(x = CurrentSmoker, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = CurrentSmoker, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
From the Visual Evidence, it
could be said that the Smoking Status affect the Wellbeing of an
individual. It could interpreted from the graph that more number of
current smokers possess poor wellbeing as compared to those who are not
a current smoker.
table(df_final$wellbeing, df_final$HeavyDrinker)
##
## Heavy Drinker Not a Heavy Drinker
## Good_Wellbeing 13189 248407
## Poor_Wellbeing 962 13791
rx.p1 <- df_final %>%
ggplot(aes(x = HeavyDrinker, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = HeavyDrinker, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be minimal
visual difference in the percentages of the Poor and Good Wellbeing for
Heavy Drinker and not a Heavy Drinker.
table(df_final$PoorPhysicalHealthDays, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## 0 170204 4631
## 1 12093 417
## 2 15446 735
## 3 8583 544
## 4 4544 291
## 5 7908 552
## 6 1245 114
## 7 4443 351
## 8 779 61
## 9 142 22
## 10 5684 562
## 11 48 6
## 12 509 79
## 13 59 6
## 14 2465 220
## 15 4756 708
## 16 109 35
## 17 68 21
## 18 132 33
## 19 20 8
## 20 2968 591
## 21 574 70
## 22 67 13
## 23 44 9
## 24 52 17
## 25 1197 295
## 26 64 7
## 27 93 25
## 28 436 129
## 29 172 71
## 30 16692 4130
df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be seen visually that
with increment in the Poor Physical Health days there is a
significant increase in the Poor Wellbeing of indivduals
table(df_final$PoorMentalHealthDays, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## 0 183836 2912
## 1 9224 230
## 2 14293 550
## 3 7969 399
## 4 3897 278
## 5 9624 706
## 6 914 110
## 7 3276 345
## 8 702 78
## 9 88 19
## 10 6492 924
## 11 21 6
## 12 370 77
## 13 36 10
## 14 1191 197
## 15 5147 1155
## 16 61 16
## 17 45 17
## 18 62 28
## 19 9 3
## 20 2960 996
## 21 217 70
## 22 35 21
## 23 22 4
## 24 40 10
## 25 976 422
## 26 33 10
## 27 66 34
## 28 285 122
## 29 188 70
## 30 9517 4934
df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be seen visually that
with increment in the Poor Mental Health days there is a significant
increase in the Poor Wellbeing of indivduals.
r1 = df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
r2 = df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
r1+r2
It is visually prominent that in both cases whenevr there is increase
in the number of Poor Physical Health days or Poor Mental Health Days
there seems to be almost linear increase in the Poor Wellbeing of the
individuals
This also coduld be deduced from the graph that there is a clear association between Poor Health Days (Mental or Physical) and the Wellbeing of an indivdual.
The more interesting part to notice in here is
that The increment in
proportions of “Poor Wellbeing” in the Poor Mental Health days is quite
more there is a significant increase in the Poor Wellbeing of
indivduals.
df_final %>% ggplot(aes(x=PoorPhysicalHealthDays,y=PoorMentalHealthDays,colour =wellbeing)) + geom_point() + geom_abline(intercept=30, slope=-1)
Visual Representation rejects my hypothesis that there is any correlation between the Poor Physical and Poor Mental heath days.
1. No presence of correlation between the Poor Mental and
Poor Health Days
2. If the Graph is colored based on the Well being
of an individual some thing very important starts to pop up :
We can visually bifurcate the presence of “Poor Wellbeing and Good
Wellbeing” with a line y=-x where the upper half denotes More Poor
Physical and More Poor Mental Health days where individauls have Poor
Wellbeing
3. The sparse distribution of points in the upper
half of the line signifies relatively lesser records of individuals who
have higher poor mental or physical health days.
df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(~wellbeing)
There is a clear presence of correlation between the two variables. That indicates that with increase in the College Rate there is parallel increment in School Rate specific to the county.
It is also evident that the distribution more or less remains the same for both wellbeings.
We could reject the possibility of interaction on wellbeing on High School and Some College Rates
df_final %>% ggplot(aes(x=PM2.5,y=PopulationDensity,colour = wellbeing)) + geom_point() + facet_wrap(~wellbeing) +ylim(0,15000)
## Warning: Removed 926 rows containing missing values (`geom_point()`).
There is a clear presence of correlation between the two variables. That indicates with increase in the Population Density there is increment in the PM2.5 specific to the county.
It is also evident that the distribution more or less remains the same for both wellbeings.
We could reject the possibility of interaction on wellbeing on High School and Some College Rates
df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(HeavyDrinker~wellbeing)
# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)
corr_m <- cor(rr_cm, use="pairwise.complete.obs")
# correlation matrix table
tab_corr(rr_county,
na.deletion="pairwise",
corr.method="pearson",
title="Correlations with pair-wise NA deletion",
show.p=TRUE)
| age | PrematureMortality | HouseholdIncome | ParkAccess | CrimeRate | UnemploymentRate | WaterSafety | HighSchoolRate | SomeCollegeRate | AccessToRecFacility | FastFoodPercentage | PM2.5 | PopulationDensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 0.023*** | -0.056*** | -0.036*** | -0.007*** | 0.020*** | 0.006** | -0.013*** | -0.038*** | -0.003 | -0.051*** | -0.016*** | -0.015*** | |
| PrematureMortality | 0.023*** | -0.681*** | -0.308*** | 0.398*** | 0.336*** | 0.103*** | -0.402*** | -0.609*** | -0.445*** | 0.276*** | 0.208*** | -0.043*** | |
| HouseholdIncome | -0.056*** | -0.681*** | 0.361*** | -0.211*** | -0.361*** | -0.093*** | 0.366*** | 0.660*** | 0.480*** | -0.034*** | -0.093*** | 0.141*** | |
| ParkAccess | -0.036*** | -0.308*** | 0.361*** | 0.290*** | -0.109*** | -0.137*** | -0.115*** | 0.486*** | 0.220*** | 0.176*** | -0.130*** | 0.373*** | |
| CrimeRate | -0.007*** | 0.398*** | -0.211*** | 0.290*** | 0.268*** | -0.092*** | -0.523*** | -0.067*** | -0.122*** | 0.262*** | 0.001 | 0.273*** | |
| UnemploymentRate | 0.020*** | 0.336*** | -0.361*** | -0.109*** | 0.268*** | -0.048*** | -0.329*** | -0.427*** | -0.311*** | 0.043*** | -0.103*** | 0.019*** | |
| WaterSafety | 0.006** | 0.103*** | -0.093*** | -0.137*** | -0.092*** | -0.048*** | 0.048*** | -0.128*** | -0.064*** | 0.021*** | 0.004* | -0.071*** | |
| HighSchoolRate | -0.013*** | -0.402*** | 0.366*** | -0.115*** | -0.523*** | -0.329*** | 0.048*** | 0.242*** | 0.248*** | -0.157*** | -0.071*** | -0.168*** | |
| SomeCollegeRate | -0.038*** | -0.609*** | 0.660*** | 0.486*** | -0.067*** | -0.427*** | -0.128*** | 0.242*** | 0.530*** | -0.024*** | 0.005* | 0.200*** | |
| AccessToRecFacility | -0.003 | -0.445*** | 0.480*** | 0.220*** | -0.122*** | -0.311*** | -0.064*** | 0.248*** | 0.530*** | -0.262*** | -0.009*** | 0.183*** | |
| FastFoodPercentage | -0.051*** | 0.276*** | -0.034*** | 0.176*** | 0.262*** | 0.043*** | 0.021*** | -0.157*** | -0.024*** | -0.262*** | 0.145*** | -0.012*** | |
| PM2.5 | -0.016*** | 0.208*** | -0.093*** | -0.130*** | 0.001 | -0.103*** | 0.004* | -0.071*** | 0.005* | -0.009*** | 0.145*** | -0.020*** | |
| PopulationDensity | -0.015*** | -0.043*** | 0.141*** | 0.373*** | 0.273*** | 0.019*** | -0.071*** | -0.168*** | 0.200*** | 0.183*** | -0.012*** | -0.020*** | |
| Computed correlation used pearson-method with pairwise-deletion. | |||||||||||||
# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
upper = "ellipse",
p.mat = testRes$p,
title="Correlation Matrix of Variables of Interest")
### Highly Correlated Groups
pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)
# density plots
premort_wb_plot <- df_final %>%
ggdensity( x = "PrematureMortality" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25)
hincome_wb_plot <- df_final %>%
ggdensity( x = "HouseholdIncome" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="none")
park_wb_plot <- df_final %>%
ggdensity( x = "ParkAccess" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="none")
crime_wb_plot <- df_final %>%
ggdensity( x = "CrimeRate" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="none")
unemp_wb_plot <- df_final %>%
ggdensity( x = "UnemploymentRate" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25)
water_wb_plot <- df_final %>%
ggdensity( x = "WaterSafety" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="none")
rec_wb_plot <- df_final %>%
ggdensity( x = "AccessToRecFacility" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="none")
fastfood_wb_plot <- df_final %>%
ggdensity( x = "FastFoodPercentage" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="none")
# premature mortality
premort_wb_plot + ggtitle("Density plot of county premature mortality by wellbeing") + theme(legend.position="right") + theme_bw()
# county level factors won't changed based on wellbeing, which is individual
df_final %>%
ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
geom_point(shape=1,alpha=0.5) +
theme_bw() +
stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
ggtitle("Scatter Plot of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000") +
facet_grid(~wellbeing)
df_final %>%
ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
geom_point(alpha=0) +
theme_bw() +
stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
ggtitle("Trendlines of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000")
Counties with higher mean household incomes tended to have lower premature mortality rates. This trend appears consistent across individuals who reported having good wellbeing and those that reported having poor wellbeing.
df_final %>%
ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
geom_jitter(shape=1,alpha=0.5) +
theme_bw() +
ggtitle("Jitter Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") +
facet_grid(~wellbeing)
df_final %>%
ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
geom_boxplot() +
theme_bw() +
ggtitle("Box Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") +
facet_grid(~wellbeing)
hincome_wb_plot + ggtitle("Density plot of county household income by wellbeing") + theme(legend.position="right") + theme_bw()
df_final %>%
ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
geom_jitter(shape=1,alpha=0.5) +
theme_bw() +
ggtitle("Jitter Plot of County-Level Mean Household Income and\nIndividual Income Group") +
facet_grid(~wellbeing)
df_final %>%
ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
geom_boxplot() +
theme_bw() +
ggtitle("Box Plot of County-Level Mean Household Income and\nInividual Income Group") +
facet_grid(~wellbeing)
Among those with good wellbeing, there appear to be more indviduals living in counties with high household incomes.
# bin mean household income to match individual income bins
df_final <- df_final %>%
mutate(HouseholdIncome.bin = cut(HouseholdIncome, breaks=c(0, 24999, 34999, 49999, 119525)))
df_final$HouseholdIncome.bin <- recode_factor(df_final$HouseholdIncome.bin, "(0,2.5e+04]" = "less than 25000", "(2.5e+04,3.5e+04]" = "[25000,35000)", "(3.5e+04,5e+04]" = "[35000,50000)","(5e+04,1.2e+05]" = "50000 or more")
frq(df_final$HouseholdIncome.bin)
## x <categorical>
## # total N=276349 valid N=276349 mean=3.40 sd=0.61
##
## Value | N | Raw % | Valid % | Cum. %
## ---------------------------------------------------
## less than 25000 | 215 | 0.08 | 0.08 | 0.08
## [25000,35000) | 17342 | 6.28 | 6.28 | 6.35
## [35000,50000) | 131804 | 47.69 | 47.69 | 54.05
## 50000 or more | 126988 | 45.95 | 45.95 | 100.00
## <NA> | 0 | 0.00 | <NA> | <NA>
# create equal sized bins
library(funModeling)
df_final$HouseholdIncome.bin.equal <- equal_freq(var=df_final$HouseholdIncome, n_bins=5)
frq(df_final$HouseholdIncome.bin.equal)
## x <categorical>
## # total N=276349 valid N=276349 mean=2.99 sd=1.41
##
## Value | N | Raw % | Valid % | Cum. %
## -------------------------------------------------
## [23751, 40704) | 55598 | 20.12 | 20.12 | 20.12
## [40704, 46233) | 55049 | 19.92 | 19.92 | 40.04
## [46233, 51106) | 55648 | 20.14 | 20.14 | 60.18
## [51106, 60397) | 55294 | 20.01 | 20.01 | 80.18
## [60397,119525] | 54760 | 19.82 | 19.82 | 100.00
## <NA> | 0 | 0.00 | <NA> | <NA>
tab_xtab(df_final$HouseholdIncome.bin, df_final$wellbeing, show.col.prc = TRUE, show.exp = TRUE)
| HouseholdIncome.bin | wellbeing | Total | |
|---|---|---|---|
| Good_Wellbeing | Poor_Wellbeing | ||
| less than 25000 |
204 204 0.1 % |
11 11 0.1 % |
215 215 0.1 % |
| [25000,35000) |
16218 16416 6.2 % |
1124 926 7.6 % |
17342 17342 6.3 % |
| [35000,50000) |
124246 124768 47.5 % |
7558 7036 51.2 % |
131804 131804 47.7 % |
| 50000 or more |
120928 120209 46.2 % |
6060 6779 41.1 % |
126988 126988 46 % |
| Total |
261596 261596 100 % |
14753 14753 100 % |
276349 276349 100 % |
χ2=166.310 · df=3 · Cramer’s V=0.025 · p=0.000 |
# compare againt individual income
df_final %>%
group_by(HouseholdIncome.bin) %>%
ggplot(aes(x = income, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
facet_wrap(~HouseholdIncome.bin) +
theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1))
park_wb_plot + ggtitle("Density plot of county park access by wellbeing") + theme(legend.position="right") + theme_bw()
crime_wb_plot + ggtitle("Density plot of county crime rate by wellbeing") + theme(legend.position="right") + theme_bw()
unemp_wb_plot + ggtitle("Density plot of county unemployment rate by wellbeing") + theme(legend.position="right") + theme_bw()
water_wb_plot + ggtitle("Density plot of Water Safety Rate by wellbeing") + theme(legend.position="right") + theme_bw()
waterlog_wb_plot <- df.omit %>%
mutate(WaterSafetyLog = log(WaterSafety,10)) %>%
filter(WaterSafetyLog >= 0) %>%
ggdensity( x = "WaterSafetyLog" ,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="right")
waterlog_wb_plot
rec_wb_plot + ggtitle("Density plot of county recreational facility access by wellbeing") + theme(legend.position="right") + theme_bw()
fastfood_wb_plot + ggtitle("Density plot of county fast food restaurant percentage\nby wellbeing") + theme(legend.position="right") + theme_bw()
# BMI
df.omit %>%
ggplot(aes(x=FastFoodPercentage, group=BMI, fill=BMI)) +
geom_density() +
facet_wrap(~BMI) +
theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=CVD, fill=CVD)) +
geom_density() +
facet_wrap(~CVD) +
theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=GeneralHealth, fill=GeneralHealth)) +
geom_density() +
facet_wrap(~GeneralHealth) +
theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=Diabetes, fill=Diabetes)) +
geom_density() +
facet_wrap(~Diabetes) +
theme_bw()
multiplot(premort_wb_plot, hincome_wb_plot, park_wb_plot, crime_wb_plot, unemp_wb_plot, water_wb_plot, rec_wb_plot, fastfood_wb_plot, plotlist=NULL, cols=2)
# set up variables
dependent <- "wellbeing"
pred.all <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","marital","PrematureMortality","HouseholdIncome","ParkAccess","CrimeRate","UnemploymentRate","WaterSafety","HighSchoolRate","SomeCollegeRate","AccessToRecFacility","FastFoodPercentage","PM2.5","PopulationDensity")
# initial model
model.all <- glm(wellbeing ~
age+
SocialSupport+
race+
income+
GeneralHealth+
PoorPhysicalHealthDays+
PoorMentalHealthDays+
Asthma+
HealthInsurance+
CVD+
LimitedActivity+
Diabetes+
employ+
BMI+
HeavyDrinker+
CurrentSmoker+
PoorSleepDays+
marital+
PrematureMortality+
HouseholdIncome+
ParkAccess+
CrimeRate+
UnemploymentRate+
WaterSafety+
HighSchoolRate+
SomeCollegeRate+
AccessToRecFacility+
FastFoodPercentage+
PM2.5+
PopulationDensity,
family=binomial(logit),
data=df_final)
summary(model.all)
##
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income +
## GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays +
## Asthma + HealthInsurance + CVD + LimitedActivity + Diabetes +
## employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays +
## marital + PrematureMortality + HouseholdIncome + ParkAccess +
## CrimeRate + UnemploymentRate + WaterSafety + HighSchoolRate +
## SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PM2.5 + PopulationDensity, family = binomial(logit), data = df_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4901 -0.2319 -0.1418 -0.1017 3.6214
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.790e+00 2.297e-01 -7.796 6.39e-15 ***
## age -3.389e-03 8.947e-04 -3.788 0.000152 ***
## SocialSupport Sometimes -8.370e-01 3.213e-02 -26.048 < 2e-16 ***
## SocialSupportAlways -2.674e+00 3.617e-02 -73.949 < 2e-16 ***
## SocialSupportNever -7.537e-01 4.073e-02 -18.506 < 2e-16 ***
## SocialSupportUsually -2.027e+00 3.450e-02 -58.765 < 2e-16 ***
## raceWhite 3.322e-01 2.681e-02 12.392 < 2e-16 ***
## income[25000-35000) -5.240e-02 3.504e-02 -1.495 0.134815
## income[35000-50000) -1.079e-01 3.572e-02 -3.020 0.002528 **
## income50000 or more -3.257e-01 3.407e-02 -9.560 < 2e-16 ***
## incomeless than 15000 2.193e-02 2.920e-02 0.751 0.452611
## GeneralHealthFair 6.709e-01 4.796e-02 13.989 < 2e-16 ***
## GeneralHealthGood 3.648e-01 4.392e-02 8.305 < 2e-16 ***
## GeneralHealthPoor 1.122e+00 5.505e-02 20.390 < 2e-16 ***
## GeneralHealthVery good 1.954e-01 4.478e-02 4.363 1.28e-05 ***
## PoorPhysicalHealthDays 3.232e-03 1.189e-03 2.718 0.006569 **
## PoorMentalHealthDays 6.163e-02 9.233e-04 66.743 < 2e-16 ***
## AsthmaYes -3.949e-02 2.908e-02 -1.358 0.174478
## HealthInsuranceWith Health Coverage -2.436e-01 2.889e-02 -8.433 < 2e-16 ***
## CVDNo Cardiovascular Disease 4.168e-02 3.589e-02 1.161 0.245546
## LimitedActivityNot Limited -4.983e-01 2.497e-02 -19.958 < 2e-16 ***
## DiabetesNon Diabetic 2.359e-03 2.895e-02 0.081 0.935073
## employRetired -2.884e-02 5.518e-02 -0.523 0.601184
## employSelf-Employed 2.616e-01 6.246e-02 4.189 2.80e-05 ***
## employStudent 2.342e-01 9.294e-02 2.520 0.011739 *
## employUnable to work 4.569e-01 5.459e-02 8.370 < 2e-16 ***
## employUnemployed for less than 1 yr 9.516e-01 6.276e-02 15.162 < 2e-16 ***
## employUnemployed for more than 1 yr 9.374e-01 5.950e-02 15.755 < 2e-16 ***
## employWaged Employement 1.731e-01 5.161e-02 3.353 0.000799 ***
## BMILow 1.101e-03 2.614e-02 0.042 0.966403
## BMIMedium -6.881e-02 2.516e-02 -2.735 0.006234 **
## HeavyDrinkerNot a Heavy Drinker -2.842e-01 4.217e-02 -6.739 1.59e-11 ***
## CurrentSmokerNot a Current Smoker -2.404e-01 2.361e-02 -10.181 < 2e-16 ***
## PoorSleepDays 1.708e-02 9.519e-04 17.941 < 2e-16 ***
## marital1 -4.774e-01 2.349e-02 -20.323 < 2e-16 ***
## PrematureMortality -5.040e-04 2.040e-04 -2.470 0.013494 *
## HouseholdIncome 2.974e-07 1.322e-06 0.225 0.821984
## ParkAccess 2.059e-03 5.702e-04 3.612 0.000304 ***
## CrimeRate 1.037e-04 5.089e-05 2.037 0.041667 *
## UnemploymentRate 7.035e-03 4.673e-03 1.506 0.132176
## WaterSafety -2.527e-03 7.138e-04 -3.540 0.000401 ***
## HighSchoolRate 5.128e-05 1.416e-03 0.036 0.971120
## SomeCollegeRate 7.678e-03 1.534e-03 5.005 5.57e-07 ***
## AccessToRecFacility -3.301e-03 2.659e-03 -1.241 0.214501
## FastFoodPercentage -1.510e-03 1.404e-03 -1.076 0.282062
## PM2.5 4.498e-03 7.338e-03 0.613 0.539909
## PopulationDensity 4.673e-06 2.556e-06 1.828 0.067493 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 115163 on 276348 degrees of freedom
## Residual deviance: 73838 on 276302 degrees of freedom
## AIC: 73932
##
## Number of Fisher Scoring iterations: 7
# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)
corr_m <- cor(rr_cm, use="pairwise.complete.obs")
# correlation matrix table
tab_corr(rr_county,
na.deletion="pairwise",
corr.method="pearson",
title="Correlations with pair-wise NA deletion",
show.p=TRUE)
| age | PrematureMortality | HouseholdIncome | ParkAccess | CrimeRate | UnemploymentRate | WaterSafety | HighSchoolRate | SomeCollegeRate | AccessToRecFacility | FastFoodPercentage | PM2.5 | PopulationDensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 0.023*** | -0.056*** | -0.036*** | -0.007*** | 0.020*** | 0.006** | -0.013*** | -0.038*** | -0.003 | -0.051*** | -0.016*** | -0.015*** | |
| PrematureMortality | 0.023*** | -0.681*** | -0.308*** | 0.398*** | 0.336*** | 0.103*** | -0.402*** | -0.609*** | -0.445*** | 0.276*** | 0.208*** | -0.043*** | |
| HouseholdIncome | -0.056*** | -0.681*** | 0.361*** | -0.211*** | -0.361*** | -0.093*** | 0.366*** | 0.660*** | 0.480*** | -0.034*** | -0.093*** | 0.141*** | |
| ParkAccess | -0.036*** | -0.308*** | 0.361*** | 0.290*** | -0.109*** | -0.137*** | -0.115*** | 0.486*** | 0.220*** | 0.176*** | -0.130*** | 0.373*** | |
| CrimeRate | -0.007*** | 0.398*** | -0.211*** | 0.290*** | 0.268*** | -0.092*** | -0.523*** | -0.067*** | -0.122*** | 0.262*** | 0.001 | 0.273*** | |
| UnemploymentRate | 0.020*** | 0.336*** | -0.361*** | -0.109*** | 0.268*** | -0.048*** | -0.329*** | -0.427*** | -0.311*** | 0.043*** | -0.103*** | 0.019*** | |
| WaterSafety | 0.006** | 0.103*** | -0.093*** | -0.137*** | -0.092*** | -0.048*** | 0.048*** | -0.128*** | -0.064*** | 0.021*** | 0.004* | -0.071*** | |
| HighSchoolRate | -0.013*** | -0.402*** | 0.366*** | -0.115*** | -0.523*** | -0.329*** | 0.048*** | 0.242*** | 0.248*** | -0.157*** | -0.071*** | -0.168*** | |
| SomeCollegeRate | -0.038*** | -0.609*** | 0.660*** | 0.486*** | -0.067*** | -0.427*** | -0.128*** | 0.242*** | 0.530*** | -0.024*** | 0.005* | 0.200*** | |
| AccessToRecFacility | -0.003 | -0.445*** | 0.480*** | 0.220*** | -0.122*** | -0.311*** | -0.064*** | 0.248*** | 0.530*** | -0.262*** | -0.009*** | 0.183*** | |
| FastFoodPercentage | -0.051*** | 0.276*** | -0.034*** | 0.176*** | 0.262*** | 0.043*** | 0.021*** | -0.157*** | -0.024*** | -0.262*** | 0.145*** | -0.012*** | |
| PM2.5 | -0.016*** | 0.208*** | -0.093*** | -0.130*** | 0.001 | -0.103*** | 0.004* | -0.071*** | 0.005* | -0.009*** | 0.145*** | -0.020*** | |
| PopulationDensity | -0.015*** | -0.043*** | 0.141*** | 0.373*** | 0.273*** | 0.019*** | -0.071*** | -0.168*** | 0.200*** | 0.183*** | -0.012*** | -0.020*** | |
| Computed correlation used pearson-method with pairwise-deletion. | |||||||||||||
# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
upper = "ellipse",
p.mat = testRes$p,
title="Correlation Matrix of Variables of Interest")
##### Highly Correlated Groups
pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)
library(car)
library(finalfit)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
vif_all <- vif(model.all)
vif_all <- as.data.frame(vif_all)
#create horizontal bar chart to display each VIF value
vif_plot_all <- vif_all %>%
ggplot(aes(x=rownames(vif_all), y=GVIF)) +
geom_col(color="#434343", fill="#6BD6DB") +
xlab("All Predictors") +
ylab("VIF Values") +
ggtitle("VIF Values for All Predictors") +
geom_hline(aes(yintercept=4),color="#434343", linetype="dashed", linewidth=0.5) +
coord_flip()
vif_plot_all
step <- stepAIC(model.all, direction="both") # Stepwise function
## Start: AIC=73931.82
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + Diabetes + employ +
## BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + HouseholdIncome + ParkAccess + CrimeRate +
## UnemploymentRate + WaterSafety + HighSchoolRate + SomeCollegeRate +
## AccessToRecFacility + FastFoodPercentage + PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - HighSchoolRate 1 73838 73930
## - Diabetes 1 73838 73930
## - HouseholdIncome 1 73838 73930
## - PM2.5 1 73838 73930
## - FastFoodPercentage 1 73839 73931
## - CVD 1 73839 73931
## - AccessToRecFacility 1 73839 73931
## - Asthma 1 73840 73932
## <none> 73838 73932
## - UnemploymentRate 1 73840 73932
## - PopulationDensity 1 73841 73933
## - CrimeRate 1 73842 73934
## - PrematureMortality 1 73844 73936
## - PoorPhysicalHealthDays 1 73845 73937
## - BMI 2 73848 73938
## - WaterSafety 1 73851 73943
## - ParkAccess 1 73851 73943
## - age 1 73852 73944
## - SomeCollegeRate 1 73863 73955
## - HeavyDrinker 1 73881 73973
## - HealthInsurance 1 73908 74000
## - CurrentSmoker 1 73940 74032
## - income 4 73948 74034
## - race 1 73995 74087
## - PoorSleepDays 1 74154 74246
## - LimitedActivity 1 74231 74323
## - marital 1 74256 74348
## - GeneralHealth 4 74386 74472
## - employ 7 74564 74644
## - PoorMentalHealthDays 1 78138 78230
## - SocialSupport 4 81876 81962
##
## Step: AIC=73929.82
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + Diabetes + employ +
## BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + HouseholdIncome + ParkAccess + CrimeRate +
## UnemploymentRate + WaterSafety + SomeCollegeRate + AccessToRecFacility +
## FastFoodPercentage + PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - Diabetes 1 73838 73928
## - HouseholdIncome 1 73838 73928
## - PM2.5 1 73838 73928
## - FastFoodPercentage 1 73839 73929
## - CVD 1 73839 73929
## - AccessToRecFacility 1 73839 73929
## - Asthma 1 73840 73930
## <none> 73838 73930
## - UnemploymentRate 1 73840 73930
## - PopulationDensity 1 73841 73931
## + HighSchoolRate 1 73838 73932
## - CrimeRate 1 73842 73932
## - PrematureMortality 1 73844 73934
## - PoorPhysicalHealthDays 1 73845 73935
## - BMI 2 73848 73936
## - WaterSafety 1 73851 73941
## - ParkAccess 1 73851 73941
## - age 1 73852 73942
## - SomeCollegeRate 1 73863 73953
## - HeavyDrinker 1 73881 73971
## - HealthInsurance 1 73908 73998
## - CurrentSmoker 1 73940 74030
## - income 4 73948 74032
## - race 1 73995 74085
## - PoorSleepDays 1 74154 74244
## - LimitedActivity 1 74232 74322
## - marital 1 74256 74346
## - GeneralHealth 4 74386 74470
## - employ 7 74564 74642
## - PoorMentalHealthDays 1 78138 78228
## - SocialSupport 4 81876 81960
##
## Step: AIC=73927.83
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + HouseholdIncome + ParkAccess + CrimeRate +
## UnemploymentRate + WaterSafety + SomeCollegeRate + AccessToRecFacility +
## FastFoodPercentage + PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - HouseholdIncome 1 73838 73926
## - PM2.5 1 73838 73926
## - FastFoodPercentage 1 73839 73927
## - CVD 1 73839 73927
## - AccessToRecFacility 1 73839 73927
## - Asthma 1 73840 73928
## <none> 73838 73928
## - UnemploymentRate 1 73840 73928
## - PopulationDensity 1 73841 73929
## + Diabetes 1 73838 73930
## + HighSchoolRate 1 73838 73930
## - CrimeRate 1 73842 73930
## - PrematureMortality 1 73844 73932
## - PoorPhysicalHealthDays 1 73845 73933
## - BMI 2 73848 73934
## - WaterSafety 1 73851 73939
## - ParkAccess 1 73851 73939
## - age 1 73852 73940
## - SomeCollegeRate 1 73863 73951
## - HeavyDrinker 1 73881 73969
## - HealthInsurance 1 73908 73996
## - CurrentSmoker 1 73940 74028
## - income 4 73948 74030
## - race 1 73996 74084
## - PoorSleepDays 1 74154 74242
## - LimitedActivity 1 74232 74320
## - marital 1 74256 74344
## - GeneralHealth 4 74394 74476
## - employ 7 74564 74640
## - PoorMentalHealthDays 1 78139 78227
## - SocialSupport 4 81876 81958
##
## Step: AIC=73925.88
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - PM2.5 1 73838 73924
## - FastFoodPercentage 1 73839 73925
## - CVD 1 73839 73925
## - AccessToRecFacility 1 73839 73925
## - Asthma 1 73840 73926
## <none> 73838 73926
## - UnemploymentRate 1 73840 73926
## - PopulationDensity 1 73841 73927
## + HouseholdIncome 1 73838 73928
## + Diabetes 1 73838 73928
## + HighSchoolRate 1 73838 73928
## - CrimeRate 1 73842 73928
## - PoorPhysicalHealthDays 1 73845 73931
## - PrematureMortality 1 73846 73932
## - BMI 2 73848 73932
## - WaterSafety 1 73851 73937
## - ParkAccess 1 73851 73937
## - age 1 73852 73938
## - SomeCollegeRate 1 73866 73952
## - HeavyDrinker 1 73881 73967
## - HealthInsurance 1 73908 73994
## - CurrentSmoker 1 73940 74026
## - income 4 73949 74029
## - race 1 73996 74082
## - PoorSleepDays 1 74154 74240
## - LimitedActivity 1 74232 74318
## - marital 1 74257 74343
## - GeneralHealth 4 74394 74474
## - employ 7 74565 74639
## - PoorMentalHealthDays 1 78140 78226
## - SocialSupport 4 81877 81957
##
## Step: AIC=73924.25
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PopulationDensity
##
## Df Deviance AIC
## - FastFoodPercentage 1 73839 73923
## - CVD 1 73840 73924
## - AccessToRecFacility 1 73840 73924
## - Asthma 1 73840 73924
## <none> 73838 73924
## - UnemploymentRate 1 73840 73924
## - PopulationDensity 1 73842 73926
## + PM2.5 1 73838 73926
## + HouseholdIncome 1 73838 73926
## + Diabetes 1 73838 73926
## + HighSchoolRate 1 73838 73926
## - CrimeRate 1 73843 73927
## - PoorPhysicalHealthDays 1 73846 73930
## - PrematureMortality 1 73846 73930
## - BMI 2 73848 73930
## - ParkAccess 1 73851 73935
## - WaterSafety 1 73851 73935
## - age 1 73853 73937
## - SomeCollegeRate 1 73867 73951
## - HeavyDrinker 1 73882 73966
## - HealthInsurance 1 73908 73992
## - CurrentSmoker 1 73941 74025
## - income 4 73949 74027
## - race 1 73998 74082
## - PoorSleepDays 1 74155 74239
## - LimitedActivity 1 74232 74316
## - marital 1 74258 74342
## - GeneralHealth 4 74394 74472
## - employ 7 74565 74637
## - PoorMentalHealthDays 1 78140 78224
## - SocialSupport 4 81877 81955
##
## Step: AIC=73923.22
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + AccessToRecFacility + PopulationDensity
##
## Df Deviance AIC
## - AccessToRecFacility 1 73840 73922
## - CVD 1 73841 73923
## - Asthma 1 73841 73923
## <none> 73839 73923
## - UnemploymentRate 1 73841 73923
## + FastFoodPercentage 1 73838 73924
## + PM2.5 1 73839 73925
## - PopulationDensity 1 73843 73925
## + Diabetes 1 73839 73925
## + HouseholdIncome 1 73839 73925
## + HighSchoolRate 1 73839 73925
## - CrimeRate 1 73843 73925
## - PoorPhysicalHealthDays 1 73847 73929
## - BMI 2 73849 73929
## - PrematureMortality 1 73849 73931
## - ParkAccess 1 73851 73933
## - WaterSafety 1 73852 73934
## - age 1 73854 73936
## - SomeCollegeRate 1 73868 73950
## - HeavyDrinker 1 73883 73965
## - HealthInsurance 1 73909 73991
## - CurrentSmoker 1 73942 74024
## - income 4 73951 74027
## - race 1 74000 74082
## - PoorSleepDays 1 74155 74237
## - LimitedActivity 1 74233 74315
## - marital 1 74259 74341
## - GeneralHealth 4 74395 74471
## - employ 7 74567 74637
## - PoorMentalHealthDays 1 78141 78223
## - SocialSupport 4 81879 81955
##
## Step: AIC=73922.24
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + marital +
## PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + PopulationDensity
##
## Df Deviance AIC
## - CVD 1 73842 73922
## - Asthma 1 73842 73922
## <none> 73840 73922
## - UnemploymentRate 1 73843 73923
## + AccessToRecFacility 1 73839 73923
## - PopulationDensity 1 73843 73923
## + FastFoodPercentage 1 73840 73924
## + PM2.5 1 73840 73924
## + HouseholdIncome 1 73840 73924
## + HighSchoolRate 1 73840 73924
## + Diabetes 1 73840 73924
## - CrimeRate 1 73844 73924
## - PoorPhysicalHealthDays 1 73848 73928
## - BMI 2 73850 73928
## - PrematureMortality 1 73849 73929
## - ParkAccess 1 73853 73933
## - WaterSafety 1 73853 73933
## - age 1 73855 73935
## - SomeCollegeRate 1 73868 73948
## - HeavyDrinker 1 73884 73964
## - HealthInsurance 1 73910 73990
## - CurrentSmoker 1 73943 74023
## - income 4 73952 74026
## - race 1 74001 74081
## - PoorSleepDays 1 74156 74236
## - LimitedActivity 1 74235 74315
## - marital 1 74260 74340
## - GeneralHealth 4 74396 74470
## - employ 7 74567 74635
## - PoorMentalHealthDays 1 78141 78221
## - SocialSupport 4 81880 81954
##
## Step: AIC=73921.59
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker +
## CurrentSmoker + PoorSleepDays + marital + PrematureMortality +
## ParkAccess + CrimeRate + UnemploymentRate + WaterSafety +
## SomeCollegeRate + PopulationDensity
##
## Df Deviance AIC
## - Asthma 1 73844 73922
## <none> 73842 73922
## - UnemploymentRate 1 73844 73922
## + CVD 1 73840 73922
## + AccessToRecFacility 1 73841 73923
## - PopulationDensity 1 73845 73923
## + FastFoodPercentage 1 73841 73923
## + PM2.5 1 73841 73923
## + Diabetes 1 73842 73924
## + HouseholdIncome 1 73842 73924
## + HighSchoolRate 1 73842 73924
## - CrimeRate 1 73846 73924
## - PoorPhysicalHealthDays 1 73849 73927
## - BMI 2 73852 73928
## - PrematureMortality 1 73850 73928
## - ParkAccess 1 73854 73932
## - WaterSafety 1 73855 73933
## - age 1 73857 73935
## - SomeCollegeRate 1 73870 73948
## - HeavyDrinker 1 73885 73963
## - HealthInsurance 1 73912 73990
## - CurrentSmoker 1 73944 74022
## - income 4 73953 74025
## - race 1 74002 74080
## - PoorSleepDays 1 74157 74235
## - LimitedActivity 1 74235 74313
## - marital 1 74262 74340
## - GeneralHealth 4 74400 74472
## - employ 7 74569 74635
## - PoorMentalHealthDays 1 78145 78223
## - SocialSupport 4 81882 81954
##
## Step: AIC=73921.58
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + HealthInsurance +
## LimitedActivity + employ + BMI + HeavyDrinker + CurrentSmoker +
## PoorSleepDays + marital + PrematureMortality + ParkAccess +
## CrimeRate + UnemploymentRate + WaterSafety + SomeCollegeRate +
## PopulationDensity
##
## Df Deviance AIC
## <none> 73844 73922
## + Asthma 1 73842 73922
## + CVD 1 73842 73922
## - UnemploymentRate 1 73846 73922
## + AccessToRecFacility 1 73843 73923
## - PopulationDensity 1 73847 73923
## + FastFoodPercentage 1 73843 73923
## + PM2.5 1 73843 73923
## + Diabetes 1 73844 73924
## + HouseholdIncome 1 73844 73924
## + HighSchoolRate 1 73844 73924
## - CrimeRate 1 73848 73924
## - PoorPhysicalHealthDays 1 73851 73927
## - BMI 2 73854 73928
## - PrematureMortality 1 73852 73928
## - ParkAccess 1 73856 73932
## - WaterSafety 1 73857 73933
## - age 1 73859 73935
## - SomeCollegeRate 1 73872 73948
## - HeavyDrinker 1 73887 73963
## - HealthInsurance 1 73915 73991
## - CurrentSmoker 1 73946 74022
## - income 4 73955 74025
## - race 1 74004 74080
## - PoorSleepDays 1 74157 74233
## - LimitedActivity 1 74235 74311
## - marital 1 74263 74339
## - GeneralHealth 4 74400 74470
## - employ 7 74570 74634
## - PoorMentalHealthDays 1 78145 78221
## - SocialSupport 4 81887 81957
model.step <- glm(formula= df_final$wellbeing ~ . ,
df_final[,(colnames(step$model[-1]))], family=binomial(logit))
summary(model.step)
##
## Call:
## glm(formula = df_final$wellbeing ~ ., family = binomial(logit),
## data = df_final[, (colnames(step$model[-1]))])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5070 -0.2318 -0.1418 -0.1017 3.6267
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.753e+00 1.675e-01 -10.468 < 2e-16 ***
## age -3.471e-03 8.841e-04 -3.926 8.64e-05 ***
## SocialSupport Sometimes -8.367e-01 3.212e-02 -26.045 < 2e-16 ***
## SocialSupportAlways -2.675e+00 3.616e-02 -73.970 < 2e-16 ***
## SocialSupportNever -7.535e-01 4.072e-02 -18.506 < 2e-16 ***
## SocialSupportUsually -2.027e+00 3.449e-02 -58.778 < 2e-16 ***
## raceWhite 3.330e-01 2.667e-02 12.484 < 2e-16 ***
## income[25000-35000) -5.192e-02 3.503e-02 -1.482 0.138265
## income[35000-50000) -1.072e-01 3.569e-02 -3.004 0.002667 **
## income50000 or more -3.254e-01 3.388e-02 -9.603 < 2e-16 ***
## incomeless than 15000 2.021e-02 2.917e-02 0.693 0.488428
## GeneralHealthFair 6.651e-01 4.773e-02 13.935 < 2e-16 ***
## GeneralHealthGood 3.628e-01 4.388e-02 8.268 < 2e-16 ***
## GeneralHealthPoor 1.112e+00 5.445e-02 20.420 < 2e-16 ***
## GeneralHealthVery good 1.951e-01 4.478e-02 4.356 1.32e-05 ***
## PoorPhysicalHealthDays 3.207e-03 1.189e-03 2.697 0.006999 **
## PoorMentalHealthDays 6.159e-02 9.227e-04 66.753 < 2e-16 ***
## HealthInsuranceWith Health Coverage -2.451e-01 2.884e-02 -8.496 < 2e-16 ***
## LimitedActivityNot Limited -4.957e-01 2.490e-02 -19.908 < 2e-16 ***
## employRetired -2.974e-02 5.516e-02 -0.539 0.589730
## employSelf-Employed 2.631e-01 6.243e-02 4.214 2.51e-05 ***
## employStudent 2.329e-01 9.292e-02 2.507 0.012179 *
## employUnable to work 4.547e-01 5.455e-02 8.335 < 2e-16 ***
## employUnemployed for less than 1 yr 9.517e-01 6.275e-02 15.168 < 2e-16 ***
## employUnemployed for more than 1 yr 9.380e-01 5.947e-02 15.773 < 2e-16 ***
## employWaged Employement 1.741e-01 5.160e-02 3.374 0.000741 ***
## BMILow 4.723e-03 2.556e-02 0.185 0.853399
## BMIMedium -6.638e-02 2.487e-02 -2.669 0.007603 **
## HeavyDrinkerNot a Heavy Drinker -2.851e-01 4.214e-02 -6.764 1.34e-11 ***
## CurrentSmokerNot a Current Smoker -2.406e-01 2.359e-02 -10.199 < 2e-16 ***
## PoorSleepDays 1.699e-02 9.506e-04 17.870 < 2e-16 ***
## marital1 -4.774e-01 2.347e-02 -20.344 < 2e-16 ***
## PrematureMortality -5.103e-04 1.723e-04 -2.963 0.003051 **
## ParkAccess 1.955e-03 5.490e-04 3.562 0.000368 ***
## CrimeRate 9.820e-05 4.787e-05 2.052 0.040209 *
## UnemploymentRate 7.354e-03 4.563e-03 1.611 0.107094
## WaterSafety -2.551e-03 7.126e-04 -3.579 0.000344 ***
## SomeCollegeRate 7.273e-03 1.382e-03 5.265 1.40e-07 ***
## PopulationDensity 4.517e-06 2.476e-06 1.824 0.068084 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 115163 on 276348 degrees of freedom
## Residual deviance: 73844 on 276310 degrees of freedom
## AIC: 73922
##
## Number of Fisher Scoring iterations: 7
# setup variable
pred.m1 <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","HealthInsurance","LimitedActivity","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","marital","PrematureMortality","ParkAccess","CrimeRate","UnemploymentRate","WaterSafety","SomeCollegeRate")
# model with reduced predictors
model.1 <- glm(wellbeing ~
age+
SocialSupport+
race+
income+
GeneralHealth+
PoorPhysicalHealthDays+
PoorMentalHealthDays+
HealthInsurance+
LimitedActivity+
employ+
BMI+
HeavyDrinker+
CurrentSmoker+
PoorSleepDays+
marital+
PrematureMortality+
ParkAccess+
CrimeRate+
UnemploymentRate+
WaterSafety+
SomeCollegeRate,
family=binomial(logit),
data=df_final)
summary(model.1)
##
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income +
## GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays +
## HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker +
## CurrentSmoker + PoorSleepDays + marital + PrematureMortality +
## ParkAccess + CrimeRate + UnemploymentRate + WaterSafety +
## SomeCollegeRate, family = binomial(logit), data = df_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5102 -0.2318 -0.1419 -0.1017 3.6271
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.779e+00 1.669e-01 -10.656 < 2e-16 ***
## age -3.456e-03 8.842e-04 -3.909 9.27e-05 ***
## SocialSupport Sometimes -8.361e-01 3.212e-02 -26.029 < 2e-16 ***
## SocialSupportAlways -2.675e+00 3.616e-02 -73.972 < 2e-16 ***
## SocialSupportNever -7.526e-01 4.071e-02 -18.486 < 2e-16 ***
## SocialSupportUsually -2.027e+00 3.449e-02 -58.775 < 2e-16 ***
## raceWhite 3.320e-01 2.667e-02 12.450 < 2e-16 ***
## income[25000-35000) -5.231e-02 3.503e-02 -1.493 0.135317
## income[35000-50000) -1.074e-01 3.569e-02 -3.008 0.002627 **
## income50000 or more -3.245e-01 3.388e-02 -9.578 < 2e-16 ***
## incomeless than 15000 2.007e-02 2.917e-02 0.688 0.491432
## GeneralHealthFair 6.654e-01 4.773e-02 13.941 < 2e-16 ***
## GeneralHealthGood 3.627e-01 4.388e-02 8.265 < 2e-16 ***
## GeneralHealthPoor 1.112e+00 5.444e-02 20.420 < 2e-16 ***
## GeneralHealthVery good 1.949e-01 4.478e-02 4.353 1.34e-05 ***
## PoorPhysicalHealthDays 3.212e-03 1.189e-03 2.701 0.006907 **
## PoorMentalHealthDays 6.160e-02 9.227e-04 66.758 < 2e-16 ***
## HealthInsuranceWith Health Coverage -2.447e-01 2.884e-02 -8.482 < 2e-16 ***
## LimitedActivityNot Limited -4.954e-01 2.490e-02 -19.894 < 2e-16 ***
## employRetired -2.945e-02 5.516e-02 -0.534 0.593396
## employSelf-Employed 2.651e-01 6.241e-02 4.247 2.16e-05 ***
## employStudent 2.334e-01 9.291e-02 2.512 0.012003 *
## employUnable to work 4.553e-01 5.455e-02 8.347 < 2e-16 ***
## employUnemployed for less than 1 yr 9.523e-01 6.275e-02 15.176 < 2e-16 ***
## employUnemployed for more than 1 yr 9.390e-01 5.947e-02 15.790 < 2e-16 ***
## employWaged Employement 1.746e-01 5.160e-02 3.383 0.000717 ***
## BMILow 5.699e-03 2.556e-02 0.223 0.823530
## BMIMedium -6.593e-02 2.487e-02 -2.651 0.008021 **
## HeavyDrinkerNot a Heavy Drinker -2.847e-01 4.215e-02 -6.755 1.42e-11 ***
## CurrentSmokerNot a Current Smoker -2.403e-01 2.359e-02 -10.183 < 2e-16 ***
## PoorSleepDays 1.699e-02 9.506e-04 17.869 < 2e-16 ***
## marital1 -4.783e-01 2.346e-02 -20.388 < 2e-16 ***
## PrematureMortality -5.024e-04 1.722e-04 -2.917 0.003531 **
## ParkAccess 2.188e-03 5.332e-04 4.103 4.08e-05 ***
## CrimeRate 1.087e-04 4.752e-05 2.287 0.022187 *
## UnemploymentRate 7.581e-03 4.559e-03 1.663 0.096362 .
## WaterSafety -2.553e-03 7.124e-04 -3.584 0.000339 ***
## SomeCollegeRate 7.457e-03 1.378e-03 5.411 6.25e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 115163 on 276348 degrees of freedom
## Residual deviance: 73847 on 276311 degrees of freedom
## AIC: 73923
##
## Number of Fisher Scoring iterations: 7
# library(corrplot)
# library(sjPlot)
rr_county.m1 <- df_final[,c(2,22, 24:27,29)]
rr_cm.m1 <- as.matrix(rr_county.m1)
corr_m1 <- cor(rr_cm.m1, use="pairwise.complete.obs")
# correlogram
testRes.m1 <- cor.mtest(rr_cm.m1, conf.level = 0.95)
corrplot.mixed(corr_m1,
upper = "ellipse",
p.mat = testRes.m1$p,
title="Correlation Matrix of Variables of Interest")
vif_1 <- vif(model.1)
vif_1 <- as.data.frame(vif_1)
#create horizontal bar chart to display each VIF value
vif_plot_1 <- vif_1 %>%
ggplot(aes(x=rownames(vif_1), y=GVIF)) +
geom_col(color="#434343", fill="#FFB87B") +
xlab("Statistically Significant Predictors") +
ylab("VIF Values") +
ggtitle("VIF Values for Reduced Model") +
geom_hline(aes(yintercept=4),color="#434343", linetype="dashed", linewidth=0.5) +
coord_flip()
vif_plot_1
# library(broom)
model.1 %>%
tidy(conf.int=TRUE, exp=TRUE) %>%
datatable(rownames=FALSE,
colnames=c("predictor", "odds ratio", "std error", "statistic", "p-value", "CI low", "CI high"),
options=list(pageLength=15, scrollX=T))
# forest plot
# library(finalfit)
#df_final %>%
# or_plot(dependent, pred.m1)
## set the seed to make your partition reproducible
set.seed(123)
train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))
train <- df_final[train_index, ]
test <- df_final[-train_index, ]
nrow(train)
## [1] 207261
nrow(test)
## [1] 69088
pacman::p_load(caret)
glm_model <- train(wellbeing ~ age + SocialSupport + race + income + GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + SomeCollegeRate, data = train, method = "glm", family = "binomial")
attributes(glm_model)
## $names
## [1] "method" "modelInfo" "modelType" "results" "pred"
## [6] "bestTune" "call" "dots" "metric" "control"
## [11] "finalModel" "preProcess" "trainingData" "ptype" "resample"
## [16] "resampledCM" "perfNames" "maximize" "yLimits" "times"
## [21] "levels" "terms" "coefnames" "contrasts" "xlevels"
##
## $class
## [1] "train" "train.formula"
glm_model$metric
## [1] "Accuracy"
glm_model$results
## parameter Accuracy Kappa AccuracySD KappaSD
## 1 none 0.9519952 0.3577324 0.0004168047 0.006218255
test$pred <- glm_model %>% predict(test)
accuracy_function = function(real, predicted) {
mean(real == predicted)
}
accuracy_function(real = test$wellbeing,
predicted = test$pred)
## [1] 0.9523362
ggplot(varImp(glm_model))
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:sjstats':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
## The following object is masked from 'package:dplyr':
##
## slice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
set.seed(123)
train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))
train <- df_final[train_index, ]
test <- df_final[-train_index, ]
cat(nrow(test),nrow(train))
## 69088 207261
levels(train$wellbeing) <- c("Good_Wellbeing", "Poor_Wellbeing")
trcntrl_ho<-trainControl(classProbs = TRUE,summaryFunction = twoClassSummary)
model_CART <- train(wellbeing ~ age + SocialSupport + race + income + GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays + HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate + WaterSafety + SomeCollegeRate , data = train, method = "rpart",trControl=trcntrl_ho,metric="ROC",tuneLength = 10)
model_CART$results
## cp ROC Sens Spec ROCSD SensSD
## 1 0.0004980080 0.7472576 0.9910863 0.2318332 0.022547168 0.0007790825
## 2 0.0005432814 0.7431999 0.9914031 0.2291817 0.005868611 0.0006624901
## 3 0.0006338283 0.7437871 0.9916812 0.2279633 0.005898479 0.0007834401
## 4 0.0007243752 0.7442818 0.9917585 0.2291082 0.005799676 0.0007911671
## 5 0.0013582035 0.7450477 0.9922785 0.2234589 0.005537854 0.0008667502
## 6 0.0015392974 0.7450434 0.9923658 0.2229886 0.005544318 0.0009054454
## 7 0.0020373053 0.7450245 0.9922127 0.2255714 0.005558849 0.0011004996
## 8 0.0024447664 0.7449736 0.9924919 0.2196554 0.005586875 0.0012487124
## 9 0.0143969576 0.7248273 0.9938143 0.1772921 0.067884816 0.0026939664
## 10 0.0171133647 0.6446700 0.9946944 0.1322237 0.120574485 0.0044303138
## SpecSD
## 1 0.010431924
## 2 0.008608276
## 3 0.011127379
## 4 0.011049154
## 5 0.014123109
## 6 0.016082827
## 7 0.020373734
## 8 0.024168458
## 9 0.059100999
## 10 0.110228244
plot(model_CART)
ggplot(varImp(model_CART))
print(varImp(model_CART))
## rpart variable importance
##
## only 20 most important variables shown (out of 43)
##
## Overall
## PoorMentalHealthDays 100.0000
## LimitedActivityNot Limited 71.1397
## GeneralHealthPoor 67.0912
## employUnable to work 60.5110
## PoorPhysicalHealthDays 55.0010
## SocialSupportUsually 25.3124
## SocialSupportAlways 20.0508
## marital1 17.5696
## SocialSupport Sometimes 8.3734
## age 4.0841
## employRetired 2.1720
## raceWhite 1.9972
## PoorSleepDays 1.9718
## PrematureMortality 1.4345
## CrimeRate 1.4050
## UnemploymentRate 1.3888
## employUnemployed for less than 1 yr 1.3424
## SomeCollegeRate 1.2201
## SocialSupportNever 0.9717
## ParkAccess 0.6516
plot(model_CART$finalModel, uniform=TRUE,
main="Classification Tree")
text(model_CART$finalModel, all=FALSE, cex=.7)
# Using the Rattle Library
library(rattle)
fancyRpartPlot(model_CART$finalModel,cex = 0.9)
test$pred_cart <- model_CART %>% predict(test)
prob_cart<- model_CART %>% predict(test,type='prob')
confusionMtx=table(test$pred_cart,test$wellbeing)
confusionMtx
##
## Good_Wellbeing Poor_Wellbeing
## Good_Wellbeing 64907 2890
## Poor_Wellbeing 472 819
TP=confusionMtx[1,1]
FP=confusionMtx[1,2]
TN=confusionMtx[2,2]
FN=confusionMtx[2,1]
TP
## [1] 64907
recall=TP/(TP+FN)
recall
## [1] 0.9927806
Accuracy=(TP+TN)/nrow(test)
Accuracy
## [1] 0.9513374
specificity=TN/(TN+FP)
specificity
## [1] 0.2208142
precision=TP/(TP+FP)
precision
## [1] 0.9573727
F1=(2*recall*precision)/(recall+precision)
F1
## [1] 0.9747552
str(prob_cart)
## 'data.frame': 69088 obs. of 2 variables:
## $ Good_Wellbeing: num 0.973 0.973 0.973 0.973 0.973 ...
## $ Poor_Wellbeing: num 0.0272 0.0272 0.0272 0.0272 0.0272 ...
CART.roc=roc(predictor=prob_cart$Good_Wellbeing,response=test$wellbeing,
levels=levels(test$wellbeing))
## Setting direction: controls > cases
CART.roc
##
## Call:
## roc.default(response = test$wellbeing, predictor = prob_cart$Good_Wellbeing, levels = levels(test$wellbeing))
##
## Data: prob_cart$Good_Wellbeing in 65379 controls (test$wellbeing Good_Wellbeing) > 3709 cases (test$wellbeing Poor_Wellbeing).
## Area under the curve: 0.7341
ggroc(CART.roc, alpha = 0.5, colour = "red", linetype = 2, size =0.7 )